#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

"""Train linear audio and video classifiers on features from a pretrained model."""

import numpy as np
import slowfast.utils.checkpoint as cu
import slowfast.utils.distributed as du
import slowfast.utils.logging as logging
import torch
import torch.distributed as dist
import torch.nn as nn
from slowfast.datasets import loader
from slowfast.models import build_model
import math

logger = logging.get_logger(__name__)

def get_epoch_lr(cur_epoch, pretrain_finetune=True, base_lr=0.1, max_epoch=50):
    """
    Compute the learning rate at a (possibly fractional) epoch using a
    half-period cosine decay from `base_lr` at epoch 0 down to 0 at
    `max_epoch`.
    Args:
        cur_epoch (float): current epoch. Fractional values are accepted so
            the rate can be updated every iteration. The value is not
            clamped, so epochs past `max_epoch` follow the cosine back up.
        pretrain_finetune (bool): selects the cosine schedule. It is the only
            schedule implemented, so this must be True.
        base_lr (float): learning rate returned at epoch 0.
        max_epoch (float): epoch at which the learning rate reaches 0.
    Returns:
        float: the learning rate for `cur_epoch`.
    Raises:
        NotImplementedError: if `pretrain_finetune` is False. Previously the
            function fell through and returned None, which would then be
            written into the optimizer as its learning rate.
    """
    if not pretrain_finetune:
        raise NotImplementedError(
            "get_epoch_lr only implements the pretrain_finetune cosine schedule"
        )
    return base_lr * (math.cos(math.pi * cur_epoch / max_epoch) + 1.0) * 0.5


def set_lr(optimizer, new_lr):
    """
    Overwrite the learning rate of every parameter group of an optimizer.
    Args:
        optimizer (optim): object exposing `param_groups`, an iterable of
            dict-like groups whose "lr" entry is replaced in place.
        new_lr (float): the learning rate to store in each group.
    """
    for group in optimizer.param_groups:
        group.update(lr=new_lr)


def perform_test(train_loader, model, cfg, writer=None):
    """
    Train two linear classifiers (one for audio, one for video) on top of
    features produced by `model`, which is kept frozen.

    Despite its name, this function does not evaluate anything. It puts
    `model` in eval mode, extracts audio/video features under
    `torch.no_grad()`, and optimizes only the linear heads with SGD for 50
    epochs. The learning rate is updated every iteration via `get_epoch_lr`.
    Running training accuracy is printed on rank 0 every 10 iterations, and
    the classifier weights are written to "umt_linear.pt" on rank 0.
    Args:
        train_loader (loader): loader yielding
            `(audio, video, labels, video_idx, meta)` batches.
        model (model): the pretrained model. It must provide
            `get_features((audio, video))`, either directly or through a
            `.module` attribute when wrapped for data parallelism.
        cfg (CfgNode): configs. Only `cfg.NUM_GPUS` is read here, to decide
            whether batches are moved to the GPU.
        writer (optional): unused; kept for interface compatibility.
    """
    class Classifier(nn.Module):
        """Two independent linear heads mapping 2048-d features to 400 logits."""

        def __init__(self):
            super(Classifier, self).__init__()
            self.video_clf = nn.Linear(2048, 400)
            self.audio_clf = nn.Linear(2048, 400)

        def forward(self, audio, video):
            audio_hat = self.audio_clf(audio)
            video_hat = self.video_clf(video)
            return audio_hat, video_hat

    # Must stay equal to the 50-epoch cosine period used by get_epoch_lr,
    # otherwise the schedule would not end at zero on the last iteration.
    num_epochs = 50
    checkpoint_path = "umt_linear.pt"

    device = torch.cuda.current_device()
    classifier = Classifier().cuda(device=device)
    classifier = nn.parallel.DistributedDataParallel(
        classifier, device_ids=[device], output_device=device
    )
    optimizer = torch.optim.SGD(
        classifier.parameters(),
        lr=0.1,
        momentum=0.9,
        weight_decay=1e-4,
    )
    # Stateless loss module: build it once instead of twice per iteration.
    criterion = nn.CrossEntropyLoss()
    data_size = len(train_loader)
    model.eval()

    # A model that is not wrapped (no `.module`) exposes get_features directly.
    feature_model = getattr(model, "module", model)
    # The rank does not change during training, so query it once.
    is_master = dist.get_rank() == 0

    for epoch in range(num_epochs):
        # Running totals for this epoch's training accuracy (local to this rank).
        audio_correct_sum = 0
        video_correct_sum = 0
        count_sum = 0
        for cur_iter, (audio, video, labels, video_idx, meta) in enumerate(train_loader):
            # Fractional epoch so the cosine schedule decays smoothly per iteration.
            lr = get_epoch_lr(epoch + cur_iter / data_size, pretrain_finetune=True)
            set_lr(optimizer, lr)

            if cfg.NUM_GPUS:
                # Transfer the data to the current GPU device.
                if isinstance(video, (list,)):
                    video = [clip.cuda(non_blocking=True) for clip in video]
                else:
                    video = video.cuda(non_blocking=True)
                labels = labels.cuda()
                audio = audio.cuda(non_blocking=True)

            # The backbone is frozen: extract features without building a graph.
            with torch.no_grad():
                audio_f, video_f = feature_model.get_features((audio, video))
            audio_hat, video_hat = classifier(audio_f, video_f)
            count_sum += audio_hat.shape[0]
            audio_correct_sum += (audio_hat.argmax(1) == labels).sum().item()
            video_correct_sum += (video_hat.argmax(1) == labels).sum().item()

            # Both heads are trained jointly by summing their losses.
            loss = criterion(audio_hat, labels) + criterion(video_hat, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if is_master and cur_iter % 10 == 0:
                print("pretrain_finetune, lr:{}, epoch:{}, iter:{}, audio_train_acc:{}, video_train_acc:{}"
                      "".format(lr, epoch, cur_iter,audio_correct_sum/count_sum, video_correct_sum/count_sum))
        # Save every 5th epoch and also after the final epoch; previously the
        # last save happened at epoch 45 and the final weights were lost.
        if is_master and (epoch % 5 == 0 or epoch == num_epochs - 1):
            torch.save(classifier.state_dict(), checkpoint_path)

def train(cfg):
    """
    Entry point: set up the environment, build the model, load its
    checkpoint and train the linear classifiers on the "train" split.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set up environment.
    du.init_distributed_training(cfg)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Print config.
    logger.info("Train with config:")
    logger.info(cfg)

    # Build the model and load checkpoint weights into it.
    model = build_model(cfg)
    cu.load_test_checkpoint(cfg, model)

    # Create the training loader.
    train_loader = loader.construct_loader(cfg, "train")
    logger.info("train model for {} iterations".format(len(train_loader)))

    # perform_test returns nothing, so its result is not bound to a name.
    perform_test(train_loader, model, cfg=cfg)

